Mohamedlamine Amimi mohamedlamine.amimi@supinfo.com Théo Bezin theo.bezin@supinfo.com Alexis Lecomte alexis.lecomte@supinfo.com Maxence Pawlowski maxence.pawlowski@supinfo.com
# Mathematical computations
import math
import numpy
# Loading and handling the data
import pandas
# Handling geographic data
import geopandas
import folium
from folium import plugins
# Building statistics and charts
import scipy.stats
import matplotlib.pyplot as pyplot
from matplotlib.ticker import NullFormatter
import plotly.express as px
# Miscellaneous
import os
import re
import time
from datetime import datetime
from tabulate import tabulate
import webbrowser
from colorama import Fore, Style
%matplotlib notebook
pyplot.style.use("ggplot")
pyplot.rcParams["figure.figsize"] = [17, 7]
pyplot.rcParams["figure.max_open_warning"] = 30
data_wp = pandas.read_csv("../data/WahsingtonPostDatabase.csv", parse_dates=["date"], date_parser=pandas.to_datetime)
data_wp.dtypes
id                              int64
name                           object
date                   datetime64[ns]
manner_of_death                object
armed                          object
age                           float64
gender                         object
race                           object
city                           object
state                          object
signs_of_mental_illness          bool
threat_level                   object
flee                           object
body_camera                      bool
longitude                     float64
latitude                      float64
is_geocoding_exact               bool
dtype: object
We check the quality of the dataset.
data_wp.isna().sum()
id                            0
name                        353
date                          0
manner_of_death               0
armed                       209
age                         413
gender                        7
race                       1323
city                          0
state                         0
signs_of_mental_illness       0
threat_level                  0
flee                        653
body_camera                   0
longitude                   711
latitude                    711
is_geocoding_exact            0
dtype: int64
We get rid of the missing values.
data_wp["name"] = data_wp["name"].fillna("Unknown") # replace NaN by "Unknown"
data_wp["armed"] = data_wp["armed"].fillna("Unknown") # replace NaN by "Unknown"
data_wp["gender"] = data_wp["gender"].fillna("Unknown") # replace NaN by "Unknown"
data_wp["race"] = data_wp["race"].fillna("Unknown") # replace NaN by "Unknown"
data_wp["threat_level"] = data_wp["threat_level"].fillna("Unknown") # replace NaN by "Unknown"
data_wp["flee"] = data_wp["flee"].fillna("Unknown") # replace NaN by "Unknown"
data_wp["latitude"] = data_wp["latitude"].fillna("Unknown") # replace NaN by "Unknown"
data_wp["longitude"] = data_wp["longitude"].fillna(0) # replace NaN by 0
age_mean = data_wp["age"].mean()
data_wp["age"] = data_wp["age"].fillna(round(age_mean)) # replace NaN ages by the rounded mean
data_wp.head()
| | id | name | date | manner_of_death | armed | age | gender | race | city | state | signs_of_mental_illness | threat_level | flee | body_camera | longitude | latitude | is_geocoding_exact |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 3 | Tim Elliot | 2015-01-02 | shot | gun | 53.0 | M | A | Shelton | WA | True | attack | Not fleeing | False | -123.122 | 47.247 | True |
| 1 | 4 | Lewis Lee Lembke | 2015-01-02 | shot | gun | 47.0 | M | W | Aloha | OR | False | attack | Not fleeing | False | -122.892 | 45.487 | True |
| 2 | 5 | John Paul Quintero | 2015-01-03 | shot and Tasered | unarmed | 23.0 | M | H | Wichita | KS | False | other | Not fleeing | False | -97.281 | 37.695 | True |
| 3 | 8 | Matthew Hoffman | 2015-01-04 | shot | toy weapon | 32.0 | M | W | San Francisco | CA | True | attack | Not fleeing | False | -122.422 | 37.763 | True |
| 4 | 9 | Michael Rodriguez | 2015-01-04 | shot | nail gun | 39.0 | M | H | Evans | CO | False | attack | Not fleeing | False | -104.692 | 40.384 | True |
We check the number of empty cells in each data column.
data_wp.isna().sum()
id 0 name 0 date 0 manner_of_death 0 armed 0 age 0 gender 0 race 0 city 0 state 0 signs_of_mental_illness 0 threat_level 0 flee 0 body_camera 0 longitude 0 latitude 0 is_geocoding_exact 0 dtype: int64
nb_data = data_wp.shape[1] * data_wp.shape[0]
nb_data
123624
data_wp.isna().sum().sum() / nb_data * 100
0.0
We have 0% missing data after cleaning.
data_wp.duplicated().sum()
0
No duplicate rows in the file.
The skewness coefficient (for a normal distribution, skew = 0).
data_wp.skew(numeric_only=True).sort_values()
is_geocoding_exact        -20.029178
id                         -0.015711
age                         0.758072
signs_of_mental_illness     1.376367
longitude                   1.561354
body_camera                 2.073920
dtype: float64
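As a quick illustration of how to read these values (a minimal sketch on made-up numbers, not taken from the dataset): a roughly symmetric sample has a skewness close to 0, while a sample with a long tail towards high values has a positive skewness.
# Minimal sketch (illustrative values only): reading the sign of the skewness
symmetric_sample = pandas.Series([1, 2, 3, 4, 5])
right_skewed_sample = pandas.Series([1, 1, 1, 2, 10])
print(symmetric_sample.skew())    # 0.0 -> symmetric
print(right_skewed_sample.skew()) # > 0 -> long tail towards high values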
data_wp.age.median() # the median (middle value) is 36
36.0
data_wp.age.mode() # the mode (most frequent value) is 37
0 37.0 Name: age, dtype: float64
data_wp.age.mean() # the mean age is about 37 years
37.12898789878988
wp_age_fig, wp_age_ax = pyplot.subplots()
_ = data_wp.age.plot(ax=wp_age_ax, kind="box", vert=False)
data_wp.race.value_counts(normalize=True) * 100 # as percentages: 41% W and 21% B; note that at least 18% are unknown
W          41.570407
B          21.933443
Unknown    18.193069
H          14.961496
A           1.443894
N           1.251375
O           0.646315
Name: race, dtype: float64
wp_race_fig, wp_race_ax = pyplot.subplots()
_ = data_wp.race.value_counts().plot(ax=wp_race_ax, kind="bar")
data_wp.gender.value_counts(normalize=True) * 100 # as percentages: 95% are men
M 95.393289 F 4.510451 Unknown 0.096260 Name: gender, dtype: float64
wp_age2_fig, wp_age2_ax = pyplot.subplots()
_ = data_wp.hist("age", ax=wp_age2_ax)
wp_gender_fig, wp_gender_ax = pyplot.subplots()
_ = data_wp.gender.value_counts().plot(ax=wp_gender_ax, kind="pie", autopct="%1.2f%%")
data_wp.flee.value_counts(normalize=True) * 100 # as percentages: more than half of the encounters involve no attempt to flee
Not fleeing    58.182068
Car            16.240374
Foot           12.953795
Unknown         8.979648
Other           3.644114
Name: flee, dtype: float64
wp_flee_fig, wp_flee_ax = pyplot.subplots()
_ = data_wp.flee.value_counts().plot(ax=wp_flee_ax, kind="pie", autopct="%1.2f%%")
data_wp.manner_of_death.value_counts(normalize=True) * 100 # as percentages
shot 95.338284 shot and Tasered 4.661716 Name: manner_of_death, dtype: float64
data_wp.armed.value_counts(normalize=True) * 100 # as percentages: 57% of the people were armed with a gun
gun 57.109461
knife 14.425193
unarmed 6.091859
vehicle 3.616612
undetermined 3.492849
...
baseball bat and bottle 0.013751
pen 0.013751
chainsaw 0.013751
gun and sword 0.013751
machete and hammer 0.013751
Name: armed, Length: 100, dtype: float64
wp_armed_fig, wp_armed_ax = pyplot.subplots()
_ = data_wp.armed.value_counts().plot(ax=wp_armed_ax, kind="barh", figsize=(20, 12))
data_wp.state.value_counts(normalize=True) * 100 # as percentages: 14% of the deaths occur in CA, followed by 8% in TX and 6% in FL
CA    14.576458
TX     8.800880
FL     6.339384
AZ     4.579208
GA     3.712871
CO     3.589109
OK     2.805281
NC     2.805281
TN     2.777778
OH     2.722772
WA     2.722772
MO     2.626513
NM     2.117712
LA     2.062706
PA     2.062706
IL     2.007701
AL     1.897690
IN     1.883938
NY     1.828933
KY     1.828933
VA     1.746425
MI     1.718922
NV     1.663916
OR     1.608911
SC     1.567657
WI     1.526403
AR     1.402640
MD     1.402640
MS     1.292629
UT     1.168867
NJ     1.168867
MN     1.100110
KS     0.907591
WV     0.907591
ID     0.838834
MA     0.728823
AK     0.673817
MT     0.632563
IA     0.618812
NE     0.508801
HI     0.508801
ME     0.412541
SD     0.330033
CT     0.302530
DC     0.288779
WY     0.288779
NH     0.275028
DE     0.233773
ND     0.206271
VT     0.151265
RI     0.068757
Name: state, dtype: float64
wp_state_plt = data_wp.state.value_counts()
wp_state_fig, wp_state_ax = pyplot.subplots()
_ = wp_state_plt.plot(ax=wp_state_ax, kind="barh", figsize=(20, 12)) # the state with the most deaths is California
state_data = data_wp.state.value_counts().rename_axis("state_code").reset_index(name="number_of_deaths")
state_data
| | state_code | number_of_deaths |
|---|---|---|
| 0 | CA | 1060 |
| 1 | TX | 640 |
| 2 | FL | 461 |
| 3 | AZ | 333 |
| 4 | GA | 270 |
| 5 | CO | 261 |
| 6 | OK | 204 |
| 7 | NC | 204 |
| 8 | TN | 202 |
| 9 | OH | 198 |
| 10 | WA | 198 |
| 11 | MO | 191 |
| 12 | NM | 154 |
| 13 | LA | 150 |
| 14 | PA | 150 |
| 15 | IL | 146 |
| 16 | AL | 138 |
| 17 | IN | 137 |
| 18 | NY | 133 |
| 19 | KY | 133 |
| 20 | VA | 127 |
| 21 | MI | 125 |
| 22 | NV | 121 |
| 23 | OR | 117 |
| 24 | SC | 114 |
| 25 | WI | 111 |
| 26 | AR | 102 |
| 27 | MD | 102 |
| 28 | MS | 94 |
| 29 | UT | 85 |
| 30 | NJ | 85 |
| 31 | MN | 80 |
| 32 | KS | 66 |
| 33 | WV | 66 |
| 34 | ID | 61 |
| 35 | MA | 53 |
| 36 | AK | 49 |
| 37 | MT | 46 |
| 38 | IA | 45 |
| 39 | NE | 37 |
| 40 | HI | 37 |
| 41 | ME | 30 |
| 42 | SD | 24 |
| 43 | CT | 22 |
| 44 | DC | 21 |
| 45 | WY | 21 |
| 46 | NH | 20 |
| 47 | DE | 17 |
| 48 | ND | 15 |
| 49 | VT | 11 |
| 50 | RI | 5 |
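The value_counts().rename_axis(...).reset_index(name=...) chain used above simply turns the counts Series into a tidy two-column DataFrame. A minimal sketch with hypothetical data:
# Minimal sketch of the reshaping pattern used above (hypothetical data)
sample_states = pandas.Series(["CA", "TX", "CA", "CA"])
print(sample_states.value_counts().rename_axis("state_code").reset_index(name="number_of_deaths"))
#   state_code  number_of_deaths
# 0         CA                 3
# 1         TX                 1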
data_wp["year"] = pandas.DatetimeIndex(data_wp["date"]).year # regrouper la date par an pour avoir une vision plus globale
year_data = data_wp["year"].value_counts().rename_axis("year").reset_index(name="deaths").sort_values(by="deaths")
year_data
| | year | deaths |
|---|---|---|
| 7 | 2022 | 281 |
| 6 | 2016 | 958 |
| 5 | 2017 | 981 |
| 4 | 2018 | 985 |
| 3 | 2015 | 994 |
| 2 | 2019 | 999 |
| 1 | 2020 | 1020 |
| 0 | 2021 | 1054 |
wp_year_plt = data_wp.year.value_counts()
wp_year_fig, wp_year_ax = pyplot.subplots()
_ = wp_year_plt.plot(ax=wp_year_ax, kind="bar")
data_wp.head()
| | id | name | date | manner_of_death | armed | age | gender | race | city | state | signs_of_mental_illness | threat_level | flee | body_camera | longitude | latitude | is_geocoding_exact | year |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 3 | Tim Elliot | 2015-01-02 | shot | gun | 53.0 | M | A | Shelton | WA | True | attack | Not fleeing | False | -123.122 | 47.247 | True | 2015 |
| 1 | 4 | Lewis Lee Lembke | 2015-01-02 | shot | gun | 47.0 | M | W | Aloha | OR | False | attack | Not fleeing | False | -122.892 | 45.487 | True | 2015 |
| 2 | 5 | John Paul Quintero | 2015-01-03 | shot and Tasered | unarmed | 23.0 | M | H | Wichita | KS | False | other | Not fleeing | False | -97.281 | 37.695 | True | 2015 |
| 3 | 8 | Matthew Hoffman | 2015-01-04 | shot | toy weapon | 32.0 | M | W | San Francisco | CA | True | attack | Not fleeing | False | -122.422 | 37.763 | True | 2015 |
| 4 | 9 | Michael Rodriguez | 2015-01-04 | shot | nail gun | 39.0 | M | H | Evans | CO | False | attack | Not fleeing | False | -104.692 | 40.384 | True | 2015 |
data_pd = pandas.read_csv("../data/PoliceDeaths.csv", index_col=None, parse_dates=["date"], date_parser=pandas.to_datetime)
data_pd["state"] = data_pd["state"].map(lambda state: state.strip())
pd_state_data = pandas.DataFrame(
data=data_pd["state"].value_counts().reset_index().values,
columns=["State", "Number of deaths"]
).sort_values("State", ascending=True)
data_pd
| | person | dept | eow | cause | cause_short | date | year | canine | dept_name | state |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Constable Darius Quimby | Albany County Constable's Office, NY | EOW: Monday, January 3, 1791 | Cause of Death: Gunfire | Gunfire | 1791-01-03 | 1791 | False | Albany County Constable's Office | NY |
| 1 | Sheriff Cornelius Hogeboom | Columbia County Sheriff's Office, NY | EOW: Saturday, October 22, 1791 | Cause of Death: Gunfire | Gunfire | 1791-10-22 | 1791 | False | Columbia County Sheriff's Office | NY |
| 2 | Deputy Sheriff Isaac Smith | Westchester County Sheriff's Department, NY | EOW: Thursday, May 17, 1792 | Cause of Death: Gunfire | Gunfire | 1792-05-17 | 1792 | False | Westchester County Sheriff's Department | NY |
| 3 | Marshal Robert Forsyth | United States Department of Justice - United S... | EOW: Saturday, January 11, 1794 | Cause of Death: Gunfire | Gunfire | 1794-01-11 | 1794 | False | United States Department of Justice - United S... | US |
| 4 | Sheriff Robert Maxwell | Greenville County Sheriff's Office, SC | EOW: Sunday, November 12, 1797 | Cause of Death: Gunfire | Gunfire | 1797-11-12 | 1797 | False | Greenville County Sheriff's Office | SC |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 22795 | K9 Bruno | Amarillo Police Department, TX | EOW: Sunday, June 12, 2016 | Cause of Death: Accidental | Accidental | 2016-06-12 | 2016 | True | Amarillo Police Department | TX |
| 22796 | K9 Lazer | United States Department of Homeland Security ... | EOW: Monday, June 20, 2016 | Cause of Death: Heat exhaustion | Heat exhaustion | 2016-06-20 | 2016 | True | United States Department of Homeland Security ... | US |
| 22797 | K9 Tyson | Fountain County Sheriff's Office, IN | EOW: Monday, June 27, 2016 | Cause of Death: Heat exhaustion | Heat exhaustion | 2016-06-27 | 2016 | True | Fountain County Sheriff's Office | IN |
| 22798 | K9 Credo | Long Beach Police Department, CA | EOW: Tuesday, June 28, 2016 | Cause of Death: Gunfire (Accidental) | Gunfire (Accidental) | 2016-06-28 | 2016 | True | Long Beach Police Department | CA |
| 22799 | K9 Roscoe | Emmett Police Department, ID | EOW: Friday, July 1, 2016 | Cause of Death: Automobile accident | Automobile accident | 2016-07-01 | 2016 | True | Emmett Police Department | ID |
22800 rows × 10 columns
data_pd.head()
| | person | dept | eow | cause | cause_short | date | year | canine | dept_name | state |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Constable Darius Quimby | Albany County Constable's Office, NY | EOW: Monday, January 3, 1791 | Cause of Death: Gunfire | Gunfire | 1791-01-03 | 1791 | False | Albany County Constable's Office | NY |
| 1 | Sheriff Cornelius Hogeboom | Columbia County Sheriff's Office, NY | EOW: Saturday, October 22, 1791 | Cause of Death: Gunfire | Gunfire | 1791-10-22 | 1791 | False | Columbia County Sheriff's Office | NY |
| 2 | Deputy Sheriff Isaac Smith | Westchester County Sheriff's Department, NY | EOW: Thursday, May 17, 1792 | Cause of Death: Gunfire | Gunfire | 1792-05-17 | 1792 | False | Westchester County Sheriff's Department | NY |
| 3 | Marshal Robert Forsyth | United States Department of Justice - United S... | EOW: Saturday, January 11, 1794 | Cause of Death: Gunfire | Gunfire | 1794-01-11 | 1794 | False | United States Department of Justice - United S... | US |
| 4 | Sheriff Robert Maxwell | Greenville County Sheriff's Office, SC | EOW: Sunday, November 12, 1797 | Cause of Death: Gunfire | Gunfire | 1797-11-12 | 1797 | False | Greenville County Sheriff's Office | SC |
data_pd.shape
(22800, 10)
data_pd.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 22800 entries, 0 to 22799 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 person 22800 non-null object 1 dept 22800 non-null object 2 eow 22800 non-null object 3 cause 22800 non-null object 4 cause_short 22800 non-null object 5 date 22800 non-null datetime64[ns] 6 year 22800 non-null int64 7 canine 22800 non-null bool 8 dept_name 22800 non-null object 9 state 22800 non-null object dtypes: bool(1), datetime64[ns](1), int64(1), object(7) memory usage: 1.6+ MB
22,800 people in total, with 9 pieces of information recorded in addition to each person's name.
data_pd.isna().sum()
person 0 dept 0 eow 0 cause 0 cause_short 0 date 0 year 0 canine 0 dept_name 0 state 0 dtype: int64
No data is missing.
nb_data = data_pd.shape[1] * data_pd.shape[0]
nb_data
228000
data_pd.duplicated().sum()
0
We have no duplicated rows.
data_pd.describe()
| | year |
|---|---|
| count | 22800.000000 |
| mean | 1951.751272 |
| std | 38.233234 |
| min | 1791.000000 |
| 25% | 1924.000000 |
| 50% | 1952.000000 |
| 75% | 1983.000000 |
| max | 2016.000000 |
data_pd.canine.value_counts()
False 22537 True 263 Name: canine, dtype: int64
data_pd.canine.value_counts(normalize=True)*100 #Transformation %
False 98.846491 True 1.153509 Name: canine, dtype: float64
pd_canine_fig, pd_canine_ax = pyplot.subplots()
_ = data_pd.canine.value_counts().plot(ax=pd_canine_ax, kind="pie", autopct="%1.2f%%")
pd_state_fig, pd_state_ax = pyplot.subplots()
_ = data_pd.state.value_counts().plot(ax=pd_state_ax, kind="pie")
The state with the most police deaths is Texas, followed by California and the state of New York.
pd_cause_fig, pd_cause_ax = pyplot.subplots()
_ = data_pd.cause_short.value_counts().plot(ax=pd_cause_ax, kind="pie")
We can see that the leading cause of death is gunfire, followed by automobile accidents.
data_pd["state"].value_counts()
TX    1880
CA    1639
NY    1574
US    1225
IL    1073
PA     901
OH     834
FL     820
KY     772
GA     698
MO     666
TN     607
MI     574
NC     549
AL     537
VA     523
OK     496
NJ     485
LA     470
IN     434
PR     363
SC     362
MA     360
MD     325
CO     305
WA     290
AR     284
RR     280
WI     275
AZ     273
MS     261
KS     257
MN     244
WV     208
IA     197
OR     182
NM     151
CT     144
DC     128
UT     128
MT     128
NV     120
NE     110
ME      86
ID      64
WY      57
HI      56
ND      55
SD      53
TR      51
RI      50
AK      48
NH      46
DE      41
VT      26
VI      14
GU      12
CZ       3
AS       3
MP       3
Name: state, dtype: int64
pd_state_data
| | State | Number of deaths |
|---|---|---|
| 51 | AK | 48 |
| 14 | AL | 537 |
| 26 | AR | 284 |
| 58 | AS | 3 |
| 29 | AZ | 273 |
| 1 | CA | 1639 |
| 24 | CO | 305 |
| 37 | CT | 144 |
| 57 | CZ | 3 |
| 38 | DC | 128 |
| 53 | DE | 41 |
| 7 | FL | 820 |
| 9 | GA | 698 |
| 56 | GU | 12 |
| 46 | HI | 56 |
| 34 | IA | 197 |
| 44 | ID | 64 |
| 4 | IL | 1073 |
| 19 | IN | 434 |
| 31 | KS | 257 |
| 8 | KY | 772 |
| 18 | LA | 470 |
| 22 | MA | 360 |
| 23 | MD | 325 |
| 43 | ME | 86 |
| 12 | MI | 574 |
| 32 | MN | 244 |
| 10 | MO | 666 |
| 59 | MP | 3 |
| 30 | MS | 261 |
| 40 | MT | 128 |
| 13 | NC | 549 |
| 47 | ND | 55 |
| 42 | NE | 110 |
| 52 | NH | 46 |
| 17 | NJ | 485 |
| 36 | NM | 151 |
| 41 | NV | 120 |
| 2 | NY | 1574 |
| 6 | OH | 834 |
| 16 | OK | 496 |
| 35 | OR | 182 |
| 5 | PA | 901 |
| 20 | PR | 363 |
| 50 | RI | 50 |
| 27 | RR | 280 |
| 21 | SC | 362 |
| 48 | SD | 53 |
| 11 | TN | 607 |
| 49 | TR | 51 |
| 0 | TX | 1880 |
| 3 | US | 1225 |
| 39 | UT | 128 |
| 15 | VA | 523 |
| 55 | VI | 14 |
| 54 | VT | 26 |
| 25 | WA | 290 |
| 28 | WI | 275 |
| 33 | WV | 208 |
| 45 | WY | 57 |
pd_state2_fig = px.choropleth(
pd_state_data,
locations="State",
color="Number of deaths", color_continuous_scale="spectral_r",
locationmode="USA-states",
labels={"Nombre de morts": "number_of_deaths"},
scope="usa",
title="Nombre de morts totales de policiers par état aux États-Unis"
)
pd_state2_fig.show()
This map shows the geographic distribution of police deaths by state. We cannot really conclude that there is any correlation between a state's geographic position and its number of officers killed on duty.
pd_state3_fig = px.bar(pd_state_data, x="State", y="Number of deaths", text_auto=".2s", title="Nombre de morts totales de policiers par état aux États-Unis")
pd_state3_fig.update_traces(textfont_size=20, textangle=0, textposition="outside", cliponaxis=False)
pd_state3_fig.show()
Texas is the state with the most total police deaths over the period covered by the dataset, with more than 1,900 deaths in all.
data_pd["year"].value_counts()
1930 323
1927 305
1932 301
1931 296
1924 289
...
1824 1
1827 1
1828 1
1830 1
1792 1
Name: year, Length: 202, dtype: int64
year_data = data_pd["year"].value_counts().rename_axis("year").reset_index(name="number_of_deaths").sort_values(by="year")
year_data
| | year | number_of_deaths |
|---|---|---|
| 181 | 1791 | 2 |
| 201 | 1792 | 1 |
| 186 | 1794 | 1 |
| 187 | 1797 | 1 |
| 188 | 1804 | 1 |
| ... | ... | ... |
| 67 | 2012 | 157 |
| 85 | 2013 | 141 |
| 62 | 2014 | 165 |
| 68 | 2015 | 157 |
| 121 | 2016 | 76 |
202 rows × 2 columns
pd_deaths_fig = px.bar(year_data, x="year", y="number_of_deaths", text_auto=".2s", title="Nombre de morts totales de policiers au cours du siècle dernier")
pd_deaths_fig.update_traces(textfont_size=20, textangle=0, textposition="outside", cliponaxis=False)
pd_deaths_fig.show()
We note that 1930 was the deadliest year for police officers in the United States, with 323 officers killed on duty. This can be explained by the workload of the state police at the time. According to a New Jersey police report, "In the 1930s it was not unusual for officers to work twelve to sixteen hours a day, without a day off, sometimes several weeks in a row, to carry out their mission of protection and service."
statebyyear_data = (data_pd["state"].where(data_pd["year"] == 1930)).value_counts()
statebyyear_data
NY    37
IL    27
US    21
TX    18
OK    17
MI    15
PA    11
CA    11
NJ    10
TN     9
RR     9
VA     9
MA     8
MO     8
KY     7
OH     7
FL     7
IA     7
IN     6
NC     6
GA     6
KS     6
WI     6
MN     5
LA     4
WA     4
AL     4
WV     3
ME     3
OR     3
PR     3
AR     3
NE     3
CT     3
CO     2
RI     2
NM     2
DC     2
MS     2
ND     2
MD     1
AZ     1
HI     1
SC     1
MT     1
Name: state, dtype: int64
We can see that in that year, 1930, it is the state of New York that has the most police deaths.
ny_causes = data_pd["cause_short"][(data_pd.year == 1930) & (data_pd.state.str.contains("NY"))].value_counts()
ny_causes = ny_causes.rename_axis("cause").reset_index(name="number_of_deaths")
ny_causes
| | cause | number_of_deaths |
|---|---|---|
| 0 | Gunfire | 11 |
| 1 | Motorcycle accident | 9 |
| 2 | Automobile accident | 5 |
| 3 | Struck by vehicle | 4 |
| 4 | Heart attack | 2 |
| 5 | Gunfire (Accidental) | 1 |
| 6 | Stabbed | 1 |
| 7 | Vehicular assault | 1 |
| 8 | Fall | 1 |
| 9 | Electrocuted | 1 |
| 10 | Vehicle pursuit | 1 |
pd_cause2_fig = px.bar(ny_causes, x="cause", y="number_of_deaths", text_auto=".2s", title="Causes de mort de policier dans l'état de New York en 1930")
pd_cause2_fig.update_traces(textfont_size=12, textangle=0, textposition="outside", cliponaxis=False)
pd_cause2_fig.show()
Most of these deaths were caused by gunfire, that is, officers killed in the line of duty during shootings or firearm attacks.
causes_data = data_pd["cause_short"].value_counts()
causes_data = causes_data.rename_axis("cause").reset_index(name="number_of_deaths")
causes_data
| | cause | number_of_deaths |
|---|---|---|
| 0 | Gunfire | 12067 |
| 1 | Automobile accident | 2348 |
| 2 | Motorcycle accident | 1134 |
| 3 | Heart attack | 977 |
| 4 | Vehicular assault | 888 |
| 5 | Struck by vehicle | 868 |
| 6 | Vehicle pursuit | 627 |
| 7 | Assault | 613 |
| 8 | Gunfire (Accidental) | 604 |
| 9 | Stabbed | 465 |
| 10 | Aircraft accident | 381 |
| 11 | Drowned | 262 |
| 12 | Struck by train | 254 |
| 13 | Fall | 197 |
| 14 | Duty related illness | 169 |
| 15 | 9/11 related illness | 110 |
| 16 | Terrorist attack | 85 |
| 17 | Electrocuted | 83 |
| 18 | Animal related | 82 |
| 19 | Accidental | 80 |
| 20 | Struck by streetcar | 68 |
| 21 | Fire | 63 |
| 22 | Heat exhaustion | 61 |
| 23 | Explosion | 58 |
| 24 | Bomb | 57 |
| 25 | Exposure to toxins | 44 |
| 26 | Weather/Natural disaster | 33 |
| 27 | Training accident | 28 |
| 28 | Structure collapse | 23 |
| 29 | Train accident | 18 |
| 30 | Exposure | 14 |
| 31 | Boating accident | 13 |
| 32 | Bicycle accident | 13 |
| 33 | Unidentified | 7 |
| 34 | Poisoned | 4 |
| 35 | Asphyxiation | 2 |
pd_cause3_fig = px.bar(causes_data, x="cause", y="number_of_deaths", text_auto=".2s", title="Causes de mort de policier pendant leur service")
pd_cause3_fig.update_traces(textfont_size=12, textangle=0, textposition="outside", cliponaxis=False)
pd_cause3_fig.show()
We import the CSV file while specifying the type of each column. Loading is faster this way, because pandas does not have to guess the type of every column, and it also lets us define the types we want, leaving less room for chance.
data_fe = pandas.read_csv("../data/FatalEncounters.csv", dtype={
"Unique ID": float,
"Name": str,
"Age": str,
"Gender": "category",
"Race": "category",
"Race with imputations": "category",
"URL of image (PLS NO HOTLINKS)": str,
"Location of injury (address)": str,
"Location of death (city)": str,
"State": str,
"Location of death (zip code)": str,
"Location of death (country)": str,
"Full Address": str,
"Latitude": str,
"Longitude": float,
"Agency or agencies involved": str,
"Highest level of force": "category",
"UID Temporary": str,
"Name Temporary": str,
"Armed/Unarmed": str,
"Alleged weapon": str,
"Aggressive physical movement": str,
"Fleeing/Not fleeing": str,
"Description Temp": str,
"URL Temp": str,
"Brief description": str,
"Dispositions/Exclusions INTERNAL USE, NOT FOR ANALYSIS": object,
"Intended use of force (Developing)": str,
"Supporting document link": str,
"Foreknowledge of mental illness? INTERNAL USE, NOT FOR ANALYSIS": object,
"Unnamed: 32": float,
"Unnamed: 33": float,
"Unique ID formula": float,
"Unique identifier (redundant)": float
}, parse_dates=[" Date of injury resulting in death (month/day/year)"], date_parser=pandas.to_datetime)
Note that some columns do not have a type that seems appropriate, for example the latitude, which is a str rather than a float. This column contains erroneous values that prevent every entry from being interpreted as a float. We will fix this after preparing the data.
print(f"Shape : {Fore.LIGHTGREEN_EX}{data_fe.shape}{Fore.RESET}")
print(data_fe.head(n=5))
print(f"Type des colonnes : {Style.DIM}{Fore.WHITE}\n{data_fe.dtypes}{Style.RESET_ALL}")
Shape : (31498, 36) Unique ID Name Age Gender Race \ 0 25746.0 Samuel H. Knapp 17 Male European-American/White 1 25747.0 Mark A. Horton 21 Male African-American/Black 2 25748.0 Phillip A. Blurbridge 19 Male African-American/Black 3 25749.0 Mark Ortiz 23 Male Hispanic/Latino 4 1.0 LaTanya Janelle McCoy 24 Female African-American/Black Race with imputations Imputation probability \ 0 European-American/White Not imputed 1 African-American/Black Not imputed 2 African-American/Black Not imputed 3 Hispanic/Latino Not imputed 4 African-American/Black Not imputed URL of image (PLS NO HOTLINKS) \ 0 NaN 1 NaN 2 NaN 3 NaN 4 NaN Date of injury resulting in death (month/day/year) \ 0 2000-01-01 1 2000-01-01 2 2000-01-01 3 2000-01-01 4 2000-01-02 Location of injury (address) ... URL Temp \ 0 27898-27804 US-101 ... NaN 1 Davison Freeway ... NaN 2 Davison Freeway ... NaN 3 600 W Cherry Ln ... NaN 4 5700 block Mack Road ... NaN Brief description \ 0 Samuel Knapp was allegedly driving a stolen ve... 1 Two Detroit men killed when their car crashed ... 2 Two Detroit men killed when their car crashed ... 3 A motorcycle was allegedly being driven errati... 4 LaTanya Janelle McCoy's car was struck from be... Dispositions/Exclusions INTERNAL USE, NOT FOR ANALYSIS \ 0 Unreported 1 Unreported 2 Unreported 3 Unreported 4 Unknown Intended use of force (Developing) \ 0 Pursuit 1 Pursuit 2 Pursuit 3 Pursuit 4 Pursuit Supporting document link \ 0 https://drive.google.com/file/d/10DisrV8K5ReP1... 1 https://drive.google.com/file/d/1-nK-RohgiM-tZ... 2 https://drive.google.com/file/d/1-nK-RohgiM-tZ... 3 https://drive.google.com/file/d/1qAEefRjX_aTtC... 4 http://www.recordnet.com/article/20000110/A_NE... Foreknowledge of mental illness? INTERNAL USE, NOT FOR ANALYSIS \ 0 No 1 No 2 No 3 No 4 No Unnamed: 32 Unnamed: 33 Unique ID formula Unique identifier (redundant) 0 NaN NaN NaN 25746.0 1 NaN NaN NaN 25747.0 2 NaN NaN NaN 25748.0 3 NaN NaN NaN 25749.0 4 NaN NaN NaN 1.0 [5 rows x 36 columns] Type des colonnes : Unique ID float64 Name object Age object Gender category Race category Race with imputations category Imputation probability object URL of image (PLS NO HOTLINKS) object Date of injury resulting in death (month/day/year) datetime64[ns] Location of injury (address) object Location of death (city) object State object Location of death (zip code) object Location of death (county) object Full Address object Latitude object Longitude float64 Agency or agencies involved object Highest level of force category UID Temporary object Name Temporary object Armed/Unarmed object Alleged weapon object Aggressive physical movement object Fleeing/Not fleeing object Description Temp object URL Temp object Brief description object Dispositions/Exclusions INTERNAL USE, NOT FOR ANALYSIS object Intended use of force (Developing) object Supporting document link object Foreknowledge of mental illness? INTERNAL USE, NOT FOR ANALYSIS object Unnamed: 32 float64 Unnamed: 33 float64 Unique ID formula float64 Unique identifier (redundant) float64 dtype: object
Here we check whether these two columns are unique. We expect the "Unique ID" column to be unique; the name, however, may contain a few duplicates.
def is_column_unique(col_name, display_name=None):
display_name = display_name if display_name else col_name
is_unique = data_fe[col_name].is_unique
print(f"Est-ce que \"{display_name}\" est unique ? : {Fore.LIGHTGREEN_EX}{is_unique}{Fore.RESET}")
if not is_unique:
columns_to_print = ["Unique ID"]
if col_name not in columns_to_print:
columns_to_print.append(col_name)
duplicated = data_fe[data_fe.duplicated(subset=[col_name], keep=False)][columns_to_print]
duplicated["Count"] = duplicated.groupby(col_name)["Unique ID"].transform("count")
print((
f"Les valeurs non uniques de \"{display_name}\" sont:\n"
f"{Style.DIM}{Fore.WHITE}{duplicated.drop_duplicates([col_name])}{Style.RESET_ALL}"
))
is_column_unique("Unique ID", display_name="id")
is_column_unique("Name")
Est-ce que "id" est unique ? : True Est-ce que "Name" est unique ? : False Les valeurs non uniques de "Name" sont: Unique ID Name Count 22 20792.0 Name withheld by police 1391 98 59.0 Donald Moore 2 195 28746.0 John Doe 1 5 196 28747.0 John Doe 2 5 197 28748.0 John Doe 3 4 ... ... ... ... 28227 28093.0 William Johnson Jr. 2 28248 28113.0 Dion Johnson 2 30806 30781.0 Eric Padilla 2 30820 30790.0 Devonte Dawayne Brown 2 31134 31134.0 Christopher Ryan Vasquez 2 [208 rows x 3 columns]
We now look at the amount (as a percentage) of missing values in each column of our dataset.
print((
"Pourcentage total de valeur manquante : "
f"{Fore.LIGHTGREEN_EX}{round(((data_fe.isna().sum().sum() / (data_fe.shape[0] * data_fe.shape[1])) * 100), 2)}%{Fore.RESET}"
))
percent_missing = round((data_fe.isna().sum() * 100) / len(data_fe), 2)
df_missing = pandas.DataFrame({"Column name": data_fe.columns, "Missing": percent_missing.values})
df_missing.sort_values("Missing", ascending=False, inplace=True)
print((
f"Pourcentage de valeur manquante par colonne :\n"
f"{Style.DIM}{Fore.WHITE}{tabulate(df_missing, showindex=False, headers=df_missing.columns)}{Style.RESET_ALL}"
))
Pourcentage total de valeur manquante : 24.75% Pourcentage de valeur manquante par colonne : Column name Missing --------------------------------------------------------------- --------- Unnamed: 33 100 Unnamed: 32 100 Unique ID formula 99.99 URL Temp 89.79 Description Temp 87.09 UID Temporary 82.45 Name Temporary 82.45 URL of image (PLS NO HOTLINKS) 53.24 Armed/Unarmed 45.78 Fleeing/Not fleeing 45.78 Alleged weapon 45.78 Aggressive physical movement 45.77 Age 3.88 Imputation probability 2.82 Race with imputations 2.77 Location of injury (address) 1.77 Location of death (zip code) 0.58 Gender 0.46 Agency or agencies involved 0.25 Foreknowledge of mental illness? INTERNAL USE, NOT FOR ANALYSIS 0.2 Location of death (city) 0.11 Location of death (county) 0.05 Dispositions/Exclusions INTERNAL USE, NOT FOR ANALYSIS 0.01 Intended use of force (Developing) 0.01 Supporting document link 0.01 Brief description 0.01 Highest level of force 0.01 Unique ID 0 Name 0 Longitude 0 Latitude 0 Full Address 0 State 0 Date of injury resulting in death (month/day/year) 0 Race 0 Unique identifier (redundant) 0
We then drop every column with more than x% (threshold) of missing data.
# Drop all columns with more than `drop_threshold`% of missing data
drop_threshold = 50
drop_columns = df_missing[df_missing["Missing"] >= drop_threshold]["Column name"].tolist()
if len(drop_columns) > 0:
print((
f"{Fore.YELLOW}Suppression des colonnes avec plus de {drop_threshold}% de données manquantes. Suppression de "
f"{drop_columns}...{Fore.RESET}"
))
drop_prev_shape = data_fe.shape
data_fe.drop(columns=drop_columns, axis=1, inplace=True)
drop_new_shape = data_fe.shape
drop_count = drop_prev_shape[1] - drop_new_shape[1]
drop_plural = "s" if drop_count > 1 else ""
print(f"{Fore.YELLOW}{drop_count} colonne{drop_plural} supprimée{drop_plural}. Nouvelle \"shape\" : {drop_new_shape}{Fore.RESET}")
Suppression des colonnes avec plus de 50% de données manquantes. Suppression de ['Unnamed: 33', 'Unnamed: 32', 'Unique ID formula', 'URL Temp', 'Description Temp', 'UID Temporary', 'Name Temporary', 'URL of image (PLS NO HOTLINKS)']... 8 colonnes supprimées. Nouvelle "shape" : (31498, 28)
To close this first overview, we count the number of duplicated rows.
duplicated_data = data_fe.duplicated().sum()
print((
f"Données dupliquées : {Fore.LIGHTGREEN_EX}{duplicated_data}{Fore.RESET}"
f" ({round((duplicated_data * 100) / data_fe.shape[0], 1)}%)"
))
Données dupliquées : 0 (0.0%)
Before any in-depth analysis, the data must be prepared, in particular by fixing the value problems in some columns.
Some columns carry the mention "INTERNAL USE, NOT FOR ANALYSIS" in their name. To respect this, we drop these columns.
data_fe.drop(list(data_fe.filter(regex="INTERNAL USE, NOT FOR ANALYSIS")), axis=1, inplace=True)
This column could not be given the float type because some of its values are not numeric.
Some values are age ranges. Two options are then possible: drop the rows concerned, or replace the range with its midpoint. We chose the second option. To fix the issue, we apply a transformation function to the column.
def _transform_age(age):
if isinstance(age, str):
if "-" in age and not age.startswith("-"):
age_range = age.split("-")
return round(float((int(age_range[0]) + int(age_range[1])) / 2), 0)
else:
return round(float(age), 0)
return round(age, 0)
data_fe["Age"] = data_fe["Age"].map(lambda age: _transform_age(age))
Some latitudes contain invalid characters. The simplest solution is to drop the rows concerned.
wrong_latitude_index = []
def _transform_latitude(lat):
if isinstance(lat, str):
try:
return float(lat)
except ValueError:
wrong_latitude_index.append(data_fe.index[data_fe["Latitude"] == lat])
return lat
return lat
data_fe["Latitude"] = data_fe["Latitude"].map(lambda lat: _transform_latitude(lat))
data_fe.drop([index for sublist in wrong_latitude_index for index in sublist], inplace=True)
data_fe = data_fe.astype({"Latitude": float})
This column raises two problems. The first is that some cells are empty or contain incorrect values. The second is the way ethnicities are recorded: each person is described by a series of ethnicities separated by a slash (/). This format makes the data hard to analyse, so we perform a One Hot Encoding to replace this column with one column per ethnicity. The values of these columns are binary, i.e. 0 or 1.
race_uncertain = "Race unspecified"
data_fe["Race"].fillna(race_uncertain, inplace=True)
data_fe["Race"] = data_fe["Race"].map(lambda race: race.title())
data_fe.loc[(data_fe["Race"] == "Christopher Anthony Alexander"), "Race"] = race_uncertain
data_fe = pandas.concat([
data_fe.drop("Race", axis=1),
data_fe["Race"].str.get_dummies(sep="/").add_prefix("Race_")
], axis=1)
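To make the one-hot encoding explicit, here is a minimal sketch of what str.get_dummies does on hypothetical values:
# Minimal sketch of the one-hot encoding above (hypothetical values)
sample_races = pandas.Series(["Hispanic/Latino", "European-American/White"])
print(sample_races.str.get_dummies(sep="/").add_prefix("Race_"))
# Each "/"-separated ethnicity becomes its own 0/1 column:
#    Race_European-American  Race_Hispanic  Race_Latino  Race_White
# 0                        0              1            1           0
# 1                        1              0            0           1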
This column, like "Fleeing/Not fleeing", has a large number of missing values. It can nevertheless be very useful for the study. For each person, its value is "Armed", "Unarmed" or "Uncertain". We therefore replace incorrect or null values with "Uncertain".
armed_uncertain = "Uncertain"
def _transform_is_armed(is_armed):
if isinstance(is_armed, str):
if (is_armed.lower() == "armed") or (is_armed.lower() == "unarmed"):
return is_armed
elif re.match(r"duplicate|none", is_armed, flags=re.IGNORECASE):
return armed_uncertain
else:
return armed_uncertain
else:
return armed_uncertain
data_fe["Armed/Unarmed"] = data_fe["Armed/Unarmed"].map(lambda is_armed: _transform_is_armed(is_armed))
data_fe = data_fe.astype({"Armed/Unarmed": "category"})
This column is as relevant to us as "Armed/Unarmed" and has the same problem, so we proceed in a similar way.
fleeing_uncertain = "Uncertain"
def _transform_is_fleeing(is_fleeing):
if isinstance(is_fleeing, str):
if re.match(r"duplicate|none", is_fleeing, flags=re.IGNORECASE):
return fleeing_uncertain
else:
parts = is_fleeing.lower().split("/")
if "fleeing" in parts:
return "Fleeing"
elif "not fleeing" in parts:
return "Not fleeing"
elif fleeing_uncertain.lower() in parts:
return fleeing_uncertain
else:
return fleeing_uncertain
else:
return fleeing_uncertain
data_fe["Fleeing/Not fleeing"] = data_fe["Fleeing/Not fleeing"].map(lambda is_fleeing: _transform_is_fleeing(is_fleeing))
data_fe = data_fe.astype({"Fleeing/Not fleeing": "category"})
This column has an extra leading space in its name. We fix that and take the opportunity to rename it to something more concise.
data_fe.rename({" Date of injury resulting in death (month/day/year)": "Date of death"}, axis=1, inplace=True)
Now that our transformations are done, we can start exploiting this dataset for statistical analysis.
freq_columns = ["Gender", "State", "Armed/Unarmed", "Fleeing/Not fleeing"]
freq_n_cols = 2
freq_n_rows = math.ceil(len(freq_columns) / freq_n_cols)
freq_fig = pyplot.figure()
freq_fig.suptitle("Fréquence des données qualitative")
for index, col in enumerate(freq_columns):
ax = pyplot.subplot(freq_n_rows, freq_n_cols, index + 1)
data_count = data_fe[[col]].value_counts()
labels = [label[0] for label in data_count.index.tolist()]
bars = data_count.plot(ax=ax, kind="bar", legend=True, label=col.title())
ax.set_xlabel("")
ax.set_xticks(ticks=numpy.arange(0, len(data_count)), labels=labels, rotation=45, fontsize=10)
for bars in ax.containers:
ax.bar_label(bars)
pyplot.tight_layout()
pyplot.show()
Our dataset contains rather few numerical columns.
print(data_fe.dtypes)
Unique ID                                float64
Name                                      object
Age                                      float64
Gender                                  category
Race with imputations                   category
Imputation probability                    object
Date of death                     datetime64[ns]
Location of injury (address)              object
Location of death (city)                  object
State                                     object
Location of death (zip code)              object
Location of death (county)                object
Full Address                              object
Latitude                                 float64
Longitude                                float64
Agency or agencies involved               object
Highest level of force                  category
Armed/Unarmed                           category
Alleged weapon                            object
Aggressive physical movement              object
Fleeing/Not fleeing                     category
Brief description                         object
Intended use of force (Developing)        object
Supporting document link                  object
Unique identifier (redundant)            float64
Race_African-American                      int64
Race_Alaskan                               int64
Race_Asian                                 int64
Race_Black                                 int64
Race_Black African-American                int64
Race_Black Not Imputed                     int64
Race_European-American                     int64
Race_Hispanic                              int64
Race_Latino                                int64
Race_Middle Eastern                        int64
Race_Native American                       int64
Race_Pacific Islander                      int64
Race_Race Unspecified                      int64
Race_Race unspecified                      int64
Race_White                                 int64
dtype: object
Le "Unique ID" n'est qu'un identifiant et une analyse de moyenne sur une latitude et une longitude n'a pas vraiment de sens. La seule colonne analysée sera alors "Age".
numerical_columns = ["Age"]
for column in numerical_columns:
# noinspection PyTypeChecker
mmmssk_fig, (mmmssk_box_ax, mmmssk_hist_ax) = pyplot.subplots(
nrows=2, ncols=1,
sharex=True,
gridspec_kw={"height_ratios": (0.2, 1)}
)
mmmssk_fig.suptitle(f"Statistiques de la colonne \"{column}\"")
mean = data_fe[column].mean()
median = data_fe[column].median()
mode = data_fe[column].mode().values[0]
data_fe[column].plot(ax=mmmssk_box_ax, kind="box", vert=False, legend=True)
mmmssk_box_ax.axvline(mean, color="darkred", linestyle="--")
mmmssk_box_ax.axvline(median, color="darkgreen", linestyle="-")
mmmssk_box_ax.axvline(mode, color="darkblue", linestyle="-")
data_fe[column].plot(ax=mmmssk_hist_ax, kind="hist", legend=True)
mmmssk_hist_ax.axvline(mean, color="darkred", linestyle="--", label="Moyenne")
mmmssk_hist_ax.axvline(median, color="darkgreen", linestyle="-", label="Médiane")
mmmssk_hist_ax.axvline(mode, color="darkblue", linestyle="-", label="Mode")
mmmssk_box_ax.set_yticklabels([""])
mmmssk_hist_ax.legend()
standard_deviation = data_fe[column].std()
skewness = data_fe[column].skew()
kurtosis = data_fe[column].kurtosis()
pyplot.figtext(0.68, 0.58, f"Autres stats :", fontsize=10, fontweight="bold")
pyplot.figtext(0.69, 0.54, f"Écart type : {round(standard_deviation, 3)}", fontsize=10)
pyplot.figtext(0.69, 0.5, f"Skewness : {round(skewness, 3)}", fontsize=10)
pyplot.figtext(0.69, 0.46, f"Kurtosis : {round(kurtosis, 3)}", fontsize=10)
pyplot.tight_layout()
pyplot.subplots_adjust(right=0.67)
pyplot.show()
We now look at how these fatal encounters are distributed across several variables, so we can see which people are most affected by them.
repart_fig = pyplot.figure()
repart_size = (2, 2)
repart_grid = repart_fig.add_gridspec(repart_size[0], repart_size[1])
_ = repart_fig.suptitle("Répartition de rencontres fatales")
male_labels = data_fe["Age"][data_fe["Gender"] == "Male"].drop_duplicates(keep="first", inplace=False).dropna().sort_values()
male_values = data_fe["Age"][data_fe["Gender"] == "Male"].value_counts().sort_index()
female_labels = data_fe["Age"][data_fe["Gender"] == "Female"].drop_duplicates(keep="first", inplace=False).dropna().sort_values()
female_values = data_fe["Age"][data_fe["Gender"] == "Female"].value_counts().sort_index()
trans_labels = data_fe["Age"][data_fe["Gender"] == "Transgender"].drop_duplicates(keep="first", inplace=False).dropna().sort_values()
trans_values = data_fe["Age"][data_fe["Gender"] == "Transgender"].value_counts().sort_index()
gender_ax = pyplot.subplot(repart_grid[0, :-1])
gender_ax.plot(male_labels, male_values.tolist(), label="Homme")
gender_ax.plot(female_labels, female_values.tolist(), label="Femme")
gender_ax.plot(trans_labels, trans_values.tolist(), label="Transgenre")
gender_ax.set_xlabel("Âge")
_ = gender_ax.legend()
# Revert one hot encoding
ethnic_data = data_fe[[column for column in data_fe if column.startswith("Race_")]].idxmax(1)
ethnic_data = ethnic_data.map(lambda ethnic: ethnic.replace("Race_", "").title())
ethnic_values = ethnic_data.value_counts().sort_values(ascending=False)
ethnic_labels = pandas.Series(data=ethnic_values.index, index=ethnic_values)
ethnic_ax = pyplot.subplot(repart_grid[1, :-1])
ethnic_ax.bar(ethnic_labels, ethnic_values.tolist())
_ = ethnic_ax.set_xticks(
ticks=numpy.arange(0, len(ethnic_labels)),
labels=ethnic_labels,
rotation=45, fontsize=8, ha="right"
)
# Fatal encounters per state
def autopct_func(pct):
return f"{round(pct, 2)}%" if pct > 5 else ""
# Get states
states = data_fe["State"].sort_values()
states_values = states.value_counts().sort_values(ascending=False)
# Merge states with low values
states_threshold = 1
states_to_merge = states_values[states_values <= ((states_values.sum() * states_threshold) / 100)]
states_values = pandas.concat([
states_values.drop(states_values.tail(n=len(states_to_merge)).index),
pandas.Series(data=sum(states_to_merge), index=["Reste"])
])
# Get states labels
states_labels = pandas.Series(data=states_values.index, index=states_values)
# Plot
states_ax = pyplot.subplot(repart_grid[0:, -1])
states_ax.pie(states_values.tolist(), labels=states_labels, autopct=autopct_func, textprops={"fontsize": 8})
states_ax.axis("equal")
_ = states_ax.set_title("Répartition par État")
Finally, all that remains is to display the charts.
pyplot.tight_layout()
pyplot.show()
# Contingency table
contingency = pandas.crosstab(
index=data_fe["Fleeing/Not fleeing"],
columns=data_fe["Armed/Unarmed"],
normalize="index"
).round(4)
contingency_numpy = contingency.to_numpy()
chi_labels_armed = contingency.columns.tolist()
chi_labels_fleeing = contingency.index.tolist()
# Plot to heatmap
chi_fig, chi_ax = pyplot.subplots()
chi_ax.imshow(contingency_numpy)
chi_ax.set_xticks(numpy.arange(len(chi_labels_armed)), labels=chi_labels_armed)
chi_ax.set_yticks(numpy.arange(len(chi_labels_fleeing)), labels=chi_labels_fleeing)
for i in range(len(chi_labels_fleeing)):
for j in range(len(chi_labels_armed)):
chi_ax.text(j, i, contingency_numpy[i, j], ha="center", va="center", color="w")
# Interpret statistics
stat, pvalue, dof, expected = scipy.stats.chi2_contingency(contingency)
print(f"Degrés de liberté : {Fore.LIGHTGREEN_EX}{dof}{Fore.RESET}")
print(f"Fréquence attendue : {Fore.LIGHTGREEN_EX}{expected}{Fore.RESET}")
prob = 0.95
critical = scipy.stats.chi2.ppf(prob, dof)
print((
f"Interprétation de test-statistic: "
f"{Style.DIM}{Fore.WHITE}(Probability: {round(prob, 3)} | Critical: {round(critical, 3)} | Stat: {round(stat, 3)}) "
f"{Style.RESET_ALL}{Fore.LIGHTGREEN_EX}>>> Dépendant: {'Oui' if (abs(stat) >= critical) else 'Non'}{Fore.RESET}"
))
alpha = 1.0 - prob
print((
f"Interprétation de p-value: "
f"{Style.DIM}{Fore.WHITE}(Significance: {round(alpha, 3)} | P-Value: {round(pvalue, 3)}) "
f"{Style.RESET_ALL}{Fore.LIGHTGREEN_EX}>>> Dépendant: {'Oui' if (pvalue <= alpha) else 'Non'}{Fore.RESET}"
))
# Finish plot
chi_ax.set_title(
"Test du khi-deux : Indépendance de \"Armed/Unarmed\" et \"Fleeing/Not fleeing\"",
loc="center",
wrap=True
)
chi_ax.grid(False)
chi_fig.tight_layout()
chi_fig.subplots_adjust(top=0.85)
pyplot.show()
Degrés de liberté : 4
Fréquence attendue : [[0.38477949 0.26884229 0.34637821]
 [0.38477949 0.26884229 0.34637821]
 [0.38474101 0.26881541 0.34634357]]
Interprétation de test-statistic: (Probability: 0.95 | Critical: 9.488 | Stat: 3.038) >>> Dépendant: Non
Interprétation de p-value: (Significance: 0.05 | P-Value: 0.552) >>> Dépendant: Non
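As a quick check of the threshold used above, the critical value printed by the test comes directly from the chi-squared distribution with the reported degrees of freedom:
# The critical value above (9.488) is the 95th percentile of the
# chi-squared distribution with 4 degrees of freedom
print(round(scipy.stats.chi2.ppf(0.95, 4), 3)) # 9.488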
tax_data = pandas.read_excel(
"../data/TaxPolicyCenter.xls",
    skiprows=5, skipfooter=4, # skip the informational header and footer rows of the sheet
    header=[0, 1, 2], # the column names span three rows, so treat them as a single multi-level header
sheet_name=None
)
tax_sample = tax_data["2019"]
The loop processes each sheet of the file. First, we rename the columns (since the header spans three rows) to obtain consistent names. We also collect a few values that will be used later for the charts and some calculations.
tax_total_data = 0
total_income_list = []
police_budget = []
budget_years = []
print("===================Data INFO=====================")
tax_sample.info() # 71 rows (states/territories) with 11 expenditure columns
for sheet in tax_data:
df = tax_data[sheet]
df.columns = ["_".join(index_str) for index_str in df.columns.to_flat_index()]
df.rename(columns={
"Unnamed: 0_level_0_Region and State_Unnamed: 0_level_2": "Region and State","Unnamed: 1_level_0_Region and State_Total": "Total",
"Unnamed: 2_level_0_Inter-_governmental": "Inter-governmental",
"Unnamed: 3_level_0_Inter-_Total": "Direct Total",
"Elementary_and Secondary_Education": "Elementary and Secondary Education",
"Elementary_Higher_Education": "Higher Education",
"Elementary_Public_Welfare": "Public Welfare",
"Elementary_Health and_Hospitals": "Health and Hospitals",
"Elementary_Health and_Highways": "Highways",
"Elementary_Health and_Police": "Police",
"Elementary_All_Other": "All Other",
"Personal_Income [1]_(Millions)": "Personal Income [1] (Millions)"
}, inplace=True)
print("===================Données manquantes pour " + sheet + " ?=====================")
    print(df.isna().sum()) # check whether any values are missing
    df.dropna(inplace=True) # drop the rows with missing values
    tax_total_data += df.shape[1]*df.shape[0] # add the sheet's total number of cells to the running count
    total_income_list.insert(0,(df.iloc[0]["Personal Income [1] (Millions)"])) # prepend this sheet's total personal income
police_budget.insert(0,(df.iloc[0]["Personal Income [1] (Millions)"])/100*(df.iloc[0]["Police"]*100))
budget_years.insert(0, sheet)
tax_sample["All years income"] = pandas.Series(total_income_list)
tax_sample["Police budget evolution"] = pandas.Series(police_budget)
===================Data INFO===================== <class 'pandas.core.frame.DataFrame'> RangeIndex: 71 entries, 0 to 70 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 (Unnamed: 0_level_0, Region and State, Unnamed: 0_level_2) 60 non-null object 1 (Unnamed: 1_level_0, Region and State, Total) 60 non-null float64 2 (Unnamed: 2_level_0, Inter-, governmental) 60 non-null float64 3 (Unnamed: 3_level_0, Inter-, Total) 60 non-null float64 4 (Elementary, and Secondary, Education) 60 non-null float64 5 (Elementary, Higher, Education) 60 non-null float64 6 (Elementary, Public, Welfare) 60 non-null float64 7 (Elementary, Health and, Hospitals) 60 non-null float64 8 (Elementary, Health and, Highways) 60 non-null float64 9 (Elementary, Health and, Police) 60 non-null float64 10 (Elementary, All, Other) 60 non-null float64 11 (Personal, Income [1], (Millions)) 60 non-null float64 dtypes: float64(11), object(1) memory usage: 6.8+ KB ===================Données manquantes pour 2019 ?===================== Region and State 11 Total 11 Inter-governmental 11 Direct Total 11 Elementary and Secondary Education 11 Higher Education 11 Public Welfare 11 Health and Hospitals 11 Highways 11 Police 11 All Other 11 Personal Income [1] (Millions) 11 dtype: int64 ===================Données manquantes pour 2018 ?===================== Region and State 11 Total 11 Inter-governmental 11 Direct Total 11 Elementary and Secondary Education 11 Higher Education 11 Public Welfare 11 Health and Hospitals 11 Highways 11 Police 11 All Other 11 Personal Income [1] (Millions) 11 dtype: int64 ===================Données manquantes pour 2017 ?===================== Region and State 11 Total 11 Inter-governmental 11 Direct Total 11 Elementary and Secondary Education 11 Higher Education 11 Public Welfare 11 Health and Hospitals 11 Highways 11 Police 11 All Other 11 Personal Income [1] (Millions) 11 dtype: int64 ===================Données manquantes pour 2016 ?===================== Region and State 11 Total 11 Inter-governmental 11 Direct Total 11 Elementary and Secondary Education 11 Higher Education 11 Public Welfare 11 Health and Hospitals 11 Highways 11 Police 11 All Other 11 Personal Income [1] (Millions) 11 dtype: int64 ===================Données manquantes pour 2015 ?===================== Region and State 11 Total 11 Inter-governmental 11 Direct Total 11 Elementary and Secondary Education 11 Higher Education 11 Public Welfare 11 Health and Hospitals 11 Highways 11 Police 11 All Other 11 Personal Income [1] (Millions) 11 dtype: int64 ===================Données manquantes pour 2014 ?===================== Region and State 11 Total 11 Inter-governmental 11 Direct Total 11 Elementary and Secondary Education 11 Higher Education 11 Public Welfare 11 Health and Hospitals 11 Highways 11 Police 11 All Other 11 Personal Income [1] (Millions) 11 dtype: int64 ===================Données manquantes pour 2013 ?===================== Region and State 11 Total 11 Inter-governmental 11 Direct Total 11 Elementary and Secondary Education 11 Higher Education 11 Public Welfare 11 Health and Hospitals 11 Highways 11 Police 11 All Other 11 Personal Income [1] (Millions) 11 dtype: int64 ===================Données manquantes pour 2012 ?===================== Region and State 11 Total 11 Inter-governmental 11 Direct Total 11 Elementary and Secondary Education 11 Higher Education 11 Public Welfare 11 Health and Hospitals 11 Highways 11 Police 11 All Other 11 Personal Income [1] (Millions) 11 dtype: int64 
===================Données manquantes pour 2011 ?===================== Region and State 11 Total 11 Inter-governmental 11 Direct Total 11 Elementary and Secondary Education 11 Higher Education 11 Public Welfare 11 Health and Hospitals 11 Highways 11 Police 11 All Other 11 Personal Income [1] (Millions) 11 dtype: int64 ===================Données manquantes pour 2010 ?===================== Region and State 11 Total 11 Inter-governmental 11 Direct Total 11 Elementary and Secondary Education 11 Higher Education 11 Public Welfare 11 Health and Hospitals 11 Highways 11 Police 11 All Other 11 Personal Income [1] (Millions) 11 dtype: int64 ===================Données manquantes pour 2009 ?===================== Region and State 11 Total 11 Inter-governmental 11 Direct Total 11 Elementary and Secondary Education 11 Higher Education 11 Public Welfare 11 Health and Hospitals 11 Highways 11 Police 11 All Other 11 Personal Income [1] (Millions) 11 dtype: int64 ===================Données manquantes pour 2008 ?===================== Region and State 11 Total 11 Inter-governmental 11 Direct Total 11 Elementary and Secondary Education 11 Higher Education 11 Public Welfare 11 Health and Hospitals 11 Highways 11 Police 11 All Other 11 Personal Income [1] (Millions) 11 dtype: int64 ===================Données manquantes pour 2007 ?===================== Region and State 11 Total 11 Inter-governmental 11 Direct Total 11 Elementary and Secondary Education 11 Higher Education 11 Public Welfare 11 Health and Hospitals 11 Highways 11 Police 11 All Other 11 Personal Income [1] (Millions) 11 dtype: int64 ===================Données manquantes pour 2006 ?===================== Region and State 11 Total 12 Inter-governmental 12 Direct Total 12 Elementary and Secondary Education 12 Higher Education 12 Public Welfare 12 Health and Hospitals 12 Highways 12 Police 12 All Other 12 Personal Income [1] (Millions) 12 dtype: int64 ===================Données manquantes pour 2005 ?===================== Region and State 11 Total 12 Inter-governmental 12 Direct Total 12 Elementary and Secondary Education 12 Higher Education 12 Public Welfare 12 Health and Hospitals 12 Highways 12 Police 12 All Other 12 Personal Income [1] (Millions) 12 dtype: int64 ===================Données manquantes pour 2004 ?===================== Region and State 11 Total 12 Inter-governmental 12 Direct Total 12 Elementary and Secondary Education 12 Higher Education 12 Public Welfare 12 Health and Hospitals 12 Highways 12 Police 12 All Other 12 Personal Income [1] (Millions) 12 dtype: int64
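The column renaming inside the loop relies on flattening the three-level header. A minimal sketch of that step on a hypothetical column:
# Minimal sketch of the header flattening used in the loop (hypothetical column)
multi_columns = pandas.MultiIndex.from_tuples([("Elementary", "Higher", "Education")])
print(["_".join(parts) for parts in multi_columns.to_flat_index()])
# ['Elementary_Higher_Education']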
print("===================Total des données=====================")
print(tax_total_data)
===================Total des données===================== 11520
We display statistics for all the columns. The ones of most interest to us are Police, All years income and Police budget evolution.
print("===================Description et stats=====================")
print(tax_sample.describe())
===================Description et stats=====================
Total Inter-governmental Direct Total \
count 60.000000 60.000000 60.000000
mean 18.281756 0.007065 18.274691
std 3.785900 0.020820 3.783119
min 0.180093 0.000000 0.179921
25% 16.517344 0.000000 16.517344
50% 18.147631 0.000018 18.147577
75% 19.794667 0.001935 19.794667
max 28.095157 0.108886 28.095157
Elementary and Secondary Education Higher Education Public Welfare \
count 60.000000 60.000000 60.000000
mean 3.867793 1.767521 4.031424
std 0.797018 0.613697 1.352895
min 0.038711 0.016792 0.040109
25% 3.579619 1.280230 3.017075
50% 3.871914 1.771184 3.819156
75% 4.202333 2.124720 5.059843
max 6.312585 3.302401 6.962010
Health and Hospitals Highways Police All Other \
count 60.000000 60.000000 60.000000 60.000000
mean 1.632759 1.243307 0.628770 5.103117
std 0.915194 0.477030 0.150432 1.397332
min 0.017351 0.010923 0.006636 0.049399
25% 1.013766 0.968849 0.556597 4.304009
50% 1.496192 1.096752 0.618976 4.935675
75% 2.058508 1.505501 0.700228 5.447679
max 4.788240 2.865624 1.116419 10.099438
Personal Income [1] (Millions) All years income \
count 6.000000e+01 1.300000e+01
mean 9.250019e+05 1.433573e+07
std 2.482756e+06 2.489145e+06
min 3.450230e+04 1.060560e+07
25% 1.018024e+05 1.249270e+07
50% 3.042645e+05 1.399838e+07
75% 6.395509e+05 1.615188e+07
max 1.854226e+07 1.854226e+07
Police budget evolution
count 13.000000
mean 99998.421001
std 13770.066801
min 74133.109050
25% 95017.083885
50% 96875.949535
75% 109116.131840
max 123046.265209
Although the statistics above already give a mean, a median and other information, we display them again here in a more readable form.
print("===================Moyenne=====================")
print("All years income :", tax_sample["All years income"].mean())
print("Police budget :", tax_sample["Police budget evolution"].mean())
===================Moyenne===================== All years income : 14335731.153846154 Police budget : 99998.42100129538
print("===================Médiane=====================")
print("All years income :", tax_sample["All years income"].median())
print("Police budget :", tax_sample["Police budget evolution"].median())
===================Médiane===================== All years income : 13998383.0 Police budget : 96875.94953533
print("===================Skew=====================")
print("All years income :", tax_sample["All years income"].skew())
print("Police budget :", tax_sample["Police budget evolution"].skew())
===================Skew===================== All years income : 0.3442130183886384 Police budget : -0.025606281986510004
For the charts we chose line plots, as they are the clearest for data covering a period of years. This is where we use the lists of values collected during the loop.
def formatter_1e6(x): # simplified display of the Y-axis labels
return str(round(x / 1e6, 1))
tpc_total_fig, tpc_total_ax = pyplot.subplots()
tpc_total_ax.plot(numpy.array(budget_years), numpy.array(total_income_list))
# tpc_total_ax.yaxis.set_major_formatter(formatter_1e6)
tpc_total_ax.yaxis.set_minor_formatter(NullFormatter()) # a Formatter instance is required, not the class itself
tpc_total_ax.set_xlabel("Année")
tpc_total_ax.set_ylabel("Budget")
tpc_total_ax.text(0, 1.05, "en millions ($)", transform=pyplot.gca().transAxes, ha="left", va="top")
tpc_total_ax.set_title("Évolution du revenu intérieur des USA")
pyplot.show()
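If we wanted the simplified axis labels, one possible hook-up (a sketch, reusing the formatter defined above) is matplotlib's FuncFormatter, which passes the tick value and its position to the callback:
# Sketch: attaching the simplified Y-axis labels with FuncFormatter
from matplotlib.ticker import FuncFormatter
tpc_total_ax.yaxis.set_major_formatter(FuncFormatter(lambda x, pos: formatter_1e6(x)))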
def formatter_1e4(x): # simplified display of the Y-axis labels
return str(round(x / 1e4, 1))
tpc_police_fig, tpc_police_ax = pyplot.subplots()
tpc_police_ax.plot(numpy.array(budget_years), numpy.array(police_budget))
# tpc_police_ax.yaxis.set_major_formatter(formatter_1e4)
tpc_police_ax.yaxis.set_minor_formatter(NullFormatter()) # a Formatter instance is required, not the class itself
tpc_police_ax.set_title("Évolution du budget de la police aux USA")
tpc_police_ax.set_xlabel("Année")
tpc_police_ax.set_ylabel("Budget ($)")
pyplot.show()
class Map:
TILE_LAYERS = [
{"name": "openstreetmap", "display_name": "Open Street Map"},
{"name": "stamentoner", "display_name": "Stamen toner"},
{"name": "cartodbpositron", "display_name": "CartoDB (Light)"},
{"name": "cartodbdark_matter", "display_name": "CartoDB (Dark)"},
]
def __init__(self, crs="EPSG:3857", **kwargs):
self.crs = crs
self.map = folium.Map(tiles=None, crs=self.crs.replace(":", ""), **kwargs)
self.layers = []
def _add_tile_layers(self):
for layer in Map.TILE_LAYERS:
folium.TileLayer(layer["name"], name=layer["display_name"]).add_to(self.map)
def _add_map_layers(self):
for layer in self.layers:
for sublayer in layer.get_layers():
sublayer.add_to(self.map)
def _register_layer(self, layer):
if isinstance(layer, MapLayer):
self.layers.append(layer)
def fit_bounds(self, south_west, north_east):
self.map.fit_bounds([south_west, north_east])
def open(self, notebook=False, output_dir="./temp", filename=None):
self._add_tile_layers()
self._add_map_layers()
folium.LayerControl().add_to(self.map)
if notebook:
return self.map
path = os.path.join(output_dir, (filename if filename else f"map-{time.time()}.html"))
self.map.save(path)
webbrowser.open(path)
class MapLayer:
    DATA_TYPES = {"DataFrame": 0, "Geo[GCS]": 1, "TimedGeo[GCS]": 2}  # distinct codes so the to_*() checks can tell layer types apart
def __init__(self, name, show_default=False):
self.name = name
self.parent_map = None
self.feature_group = folium.FeatureGroup(self.name, overlay=True, show=show_default)
self.layers = []
self.data = None
self.data_type = False
def get_layers(self):
return self.layers
def add_to(self, m):
if isinstance(m, Map):
self.parent_map = m
# noinspection PyProtectedMember
self.parent_map._register_layer(self)
return self
def load_dataframe(self, data):
self.data = data
self.data_type = MapLayer.DATA_TYPES["DataFrame"]
return self
def load_gcs_data(self, data, col_names=None, time_column=None):
if time_column is None:
col_names = col_names if col_names else {"lat": "Latitude", "lon": "Longitude"}
data_coords = data[[col_names["lat"], col_names["lon"]]].dropna(axis=0, how="any")
geometry = geopandas.points_from_xy(data_coords[col_names["lon"]], data_coords[col_names["lat"]])
self.data = geopandas.GeoDataFrame(data, geometry=geometry, crs=self.parent_map.crs)
self.data_type = MapLayer.DATA_TYPES["Geo[GCS]"]
else:
data_coords = data[[time_column, col_names["lat"], col_names["lon"]]].dropna(axis=0, how="any")
data_dates = data[time_column].drop_duplicates()
data_timed = {}
            for _, d in data_dates.items():
data_timed[d.date().__str__()] = data_coords.loc[data_coords[time_column] == d][[col_names["lat"], col_names["lon"]]].values.tolist()
self.data = data_timed
self.data_type = MapLayer.DATA_TYPES["TimedGeo[GCS]"]
return self
def _add_to_layer(self, item):
item.add_to(self.feature_group)
self.layers.append(item)
def to_heatmap(self, **kwargs):
if not self.data_type == MapLayer.DATA_TYPES["Geo[GCS]"]:
raise RuntimeError("MapLayer: to_heatmap() is only available for GCS data.")
heat_data = [[point.xy[1][0], point.xy[0][0]] for point in self.data.geometry if not point.is_empty]
heat_map = folium.plugins.HeatMap(heat_data, name=self.name, **kwargs)
self._add_to_layer(heat_map)
return self
def to_timed_heatmap(self, **kwargs):
if not self.data_type == MapLayer.DATA_TYPES["TimedGeo[GCS]"]:
raise RuntimeError("MapLayer: to_timed_heatmap() is only available for timed GCS data.")
heat_map = folium.plugins.HeatMapWithTime(list(self.data.values()), index=list(self.data.keys()), name=self.name, **kwargs)
self._add_to_layer(heat_map)
def to_choropleth(self, key_on=None, fill_color=None, **kwargs):
if not self.data_type == MapLayer.DATA_TYPES["DataFrame"]:
raise RuntimeError("MapLayer: to_choropleth() is only available for pandas dataframes.")
choropleth = folium.Choropleth(
data=self.data,
key_on=(key_on if key_on else "feature.id"),
fill_color=(fill_color if fill_color else "YlOrRd"),
**kwargs
)
self._add_to_layer(choropleth)
return self
data_wp_geo = data_wp.loc[data_wp["is_geocoding_exact"]]
data_fe_geo = data_fe.loc[data_fe["Date of death"] >= datetime(2015, 1, 2)]
plot_map = Map()
encounter_layer = MapLayer("Fatal encounters", show_default=True)
encounter_layer.add_to(plot_map).load_gcs_data(data_fe, col_names={"lat": "Latitude", "lon": "Longitude"}, time_column="Date of death").to_timed_heatmap(
min_opacity=0.4, radius=20
)
police_death_layer = MapLayer("Police Deaths", show_default=False)
police_death_layer.add_to(plot_map).load_dataframe(pd_state_data).to_choropleth(
geo_data="https://raw.githubusercontent.com/python-visualization/folium/main/examples/data/us-states.json",
columns=["State", "Number of deaths"],
name="Police deaths",
legend_name="Police deaths",
key_on="feature.id",
fill_color="YlOrRd",
fill_opacity=0.7,
line_opacity=0.2
)
plot_map.open(notebook=True)